HW8

Parnian kassraie - 93111185

November 30, 2016


Preparing the data

library(dplyr)
library(magrittr)
library(highcharter)

wdiEc <- read.csv("~/University/5th Semester/Data Analysis/Assignments/HW8/data/wdiEc.csv")
wdiHe <-read.csv("~/University/5th Semester/Data Analysis/Assignments/HW8/data/wdiHe.csv")
wdiEd <-read.csv("~/University/5th Semester/Data Analysis/Assignments/HW8/data/wdiEd.csv")

NotCountry <- c("WLD","HIC","OED","ECS","EUU","LMY","MIC","EMU","EAS",
           "NAC","UMC","EAP","LMC","LCN","ECA","LAC","MEA","ARB",
           "CEB","MNA","SSF","SSA","LDC","SST","FCS","OSS","HPC",
           "LIC")


filter(wdiEc,!(Country.Code %in% NotCountry)) -> wdiEc
filter(wdiEd,!(Country.Code %in% NotCountry)) -> wdiEd
filter(wdiHe,!(Country.Code %in% NotCountry)) -> wdiHe

I) Economy and Growth

1. Comparing Iran With Other Countries

The following Indicators are compared in the year 2013.

  • GDP per capita (current US$)/NY.GDP.PCAP.CD

  • GDP growth (annual %)/ NY.GDP.MKTP.KD.ZG

  • Inflation, consumer prices (annual %)/FP.CPI.TOTL.ZG

  • Technical cooperation grants (BoP, current US$)/BX.GRT.TECH.CD.WD

  • Total debt service (% of exports of goods, services and primary income)/DT.TDS.DECT.EX.ZS

Indicators = c("NY.GDP.PCAP.CD","NY.GDP.MKTP.KD.ZG","FP.CPI.TOTL.ZG","BX.GRT.TECH.CD.WD","DT.TDS.DECT.EX.ZS")
wdiEc %>% filter(Indicator.Code %in% Indicators, !is.na(X2013)) ->EcGr

A) GDP per capita (current US$)

Iran is Ranked 99 amongst 212 countries in the world.

EcGr %>% filter(Indicator.Code == "NY.GDP.PCAP.CD") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "GDP Per Capita (US$)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

B)GDP growth (annual %)

Iran is Ranked 199 amongst 212 countries in the world.

EcGr %>% filter(Indicator.Code == "NY.GDP.MKTP.KD.ZG") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "GDP growth (annual %)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

C) Inflation, consumer prices (annual %)

Iran is Ranked 2 amongst 212 countries in the world. No Surprise!

EcGr %>% filter(Indicator.Code == "FP.CPI.TOTL.ZG") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Inflation, consumer prices (annual %)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

D) Technical cooperation grants (BoP, current US$)

Iran is Ranked 53 amongst 212 countries in the world.

EcGr %>% filter(Indicator.Code ==  "BX.GRT.TECH.CD.WD") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Technical cooperation grants (BoP, current US$)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

E) Total debt service (% of exports of goods, services and primary income)

Iran is Ranked 107 amongst 212 countries in the world.

EcGr %>% filter(Indicator.Code == "DT.TDS.DECT.EX.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")

IransRank = filter(plot1,Country.Code=="IRN")$Ranking

hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Total debt service (% of exports of goods, services and primary income)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

2. Calculating PCAs

We calculate the mean value over the years for each indicator of each country, creating a new dataframe. Then we’ll reshape it to calculate the PCs.

# averaging on all the years
Values <- rowMeans(wdiEc[,5:60],na.rm = TRUE)
wdiEc2<- data.frame(wdiEc$Country.Code,wdiEc$Country.Name,wdiEc$Indicator.Code,Values)
names(wdiEc2) <- c("Country.Code","Country.Name","Indicator.Code","Values")

# reshaping the dataset
require(tidyr)
wdiEc2 %>% spread(Indicator.Code,Values)->wdiEc2
Names = wdiEc2[,1]
wdiEc2[is.na(wdiEc2)]=0
row.names(wdiEc2) = wdiEc2$Country.Code

#calculating PCs
PCmat <- prcomp(wdiEc2[,-c(1,2)], scale. = T, center = T)
plot(summary(PCmat)$importance[3,], type="l",
     ylab="%variance explained",
     xlab="nth component (decreasing order)")
abline(h=0.8, col="indianred")

We’ll be keeping 19 PCs, describing about 80% of the data’s variance.

# choosing first 19 PCs, and showing the feature vector
chosen.components = 1:19
feature.vector = PCmat$rotation[,chosen.components]

knitr:: kable(feature.vector[1:10,1:5])
PC1 PC2 PC3 PC4 PC5
BG.GSR.NFSV.GD.ZS 0.0125849 -0.0110843 -0.0514514 0.0106519 0.0857140
BM.GSR.CMCP.ZS -0.0118322 0.0026685 -0.0660256 0.0940998 0.0333804
BM.GSR.FCTY.CD -0.1196605 -0.0013176 0.0263301 -0.0003438 0.0367208
BM.GSR.GNFS.CD -0.1275100 0.0010212 0.0263476 0.0122738 0.0289926
BM.GSR.INSF.ZS -0.0048009 -0.0057565 -0.0556014 0.0180324 0.0384153
BM.GSR.MRCH.CD -0.1275197 0.0011001 0.0271734 0.0130584 0.0265560
BM.GSR.NFSV.CD -0.1256830 0.0005570 0.0257901 0.0106707 0.0391103
BM.GSR.ROYL.CD -0.1052432 0.0030626 -0.0165613 -0.0037154 0.0263410
BM.GSR.TOTL.CD -0.1275161 0.0006523 0.0249218 0.0099674 0.0295785
BM.GSR.TRAN.ZS 0.0061086 -0.0026510 0.0036186 0.1374019 -0.0278359
wdiEcomy  = cbind(Country.Code= wdiEc2$Country.Code,Country.Name = wdiEc2$Country.Name,data.frame(PCmat$x))

# Rank according to the first and second PC
wdiEcomy[,"Ranking1"]= rank(-wdiEcomy$PC1,ties.method = "min")
wdiEcomy[,"Ranking2"]= rank(-wdiEcomy$PC2,ties.method = "min")

Now let’s see how the first pc ranks the countries:

IransRank = filter(wdiEcomy,Country.Code=="IRN")$Ranking1
hchart( wdiEcomy, type = "column", x= Country.Name, y=Ranking1, group = Ranking1) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "PC1") %>%
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1))) %>% 
  hc_legend(enabled=F)

Iran is ranked 147 amongst 236 countries. the First countries are weaker (ecomony-wise) compared to the last ones. For example the first country is liberia and the last one is the United states. We can see that the first PC has saved the variance really well.

The first 20 countries according to the second PC are ranked below.

IransRank = filter(wdiEcomy,Country.Code=="IRN")$Ranking2

wdiEcomy = wdiEcomy[order(wdiEcomy$Ranking2),]
hchart( wdiEcomy[1:20,], type = "column", x= Country.Name[1:20], y=Ranking2[1:20], group = Ranking2[1:20]) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "PC2") %>%
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1))) %>% 
  hc_legend(enabled=F)

3. Clustering the Countries Based on Economical Factors

library(cluster)
library(fpc)

#Clustering on the first 19 pcs
clus <- kmeans(wdiEcomy[,3:22], centers=5)

clusplot(wdiEcomy[,3:22], clus$cluster, color=TRUE, shade=TRUE, 
         labels=2, lines=0)

Let’s find out about the countires inside each cluster.

k5Clus <- data.frame(clus$cluster)
k5Clus <- data.frame(rownames(k5Clus),clus$cluster)
names(k5Clus) <- c("Country.Code","Cluster")

countrynames = select(wdiEc2, Country.Name, Country.Code)
k5Clus = merge (countrynames, k5Clus, by = "Country.Code")

hchart(k5Clus, type = "column", x=Country.Name, y=Cluster, group = Cluster) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Members of Each Cluster regarding Economy") %>%
  hc_xAxis(title = list(text = "Country Name")) %>% 
  hc_yAxis(title = list(text = "Cluster"))

The countries similar to Iran Economical-wise are:

n <- k5Clus$Cluster[which(k5Clus$Country.Code=="IRN")]

k5Clus %>% filter(.,Cluster==n) -> temp

knitr::kable(temp)
Country.Code Country.Name Cluster
IDN Indonesia 3
IRN Iran, Islamic Rep. 3
VNM Vietnam 3

4. A Linear Model For coutries Growth

we reduce the data to the first PC. I’ll define Growth as weighted sum over the indicator differences in two consecutive years, thus, the weight of an indicator becomes it’s corresponding value in PC1. Note that the weight of each indicator in PC1, somehow show’s it’s importance in economical state of a country.

Let’s see the growth factor explained above for Iran, USA and iceland.

wdiEc %>% filter(Country.Code=="IRN") -> IranEc
IranEc[is.na(IranEc)] <- 0
wdiEc %>% filter(Country.Code=="ISL") -> IceEc
IceEc[is.na(IceEc)] <- 0
wdiEc %>% filter(Country.Code=="USA") -> USEc
USEc[is.na(USEc)] <- 0

weights <- PCmat$rotation[,1]

IranEc <- (-(weights %*% as.matrix(IranEc[,6:58])) + (weights %*% as.matrix(IranEc[,5:57])))
USEc <- -(weights %*% as.matrix(USEc[,6:58])) + (weights %*% as.matrix(USEc[,5:57]))
IceEc <- -(weights %*% as.matrix(IceEc[,6:58])) + (weights %*% as.matrix(IceEc[,5:57]))


highchart() %>% 
  hc_xAxis(categories = 1960:2012) %>% 
  hc_add_series(name = "IRAN",data = t(IranEc)) %>% 
  hc_add_series(name = "USA",data = t(USEc)) %>%
  hc_add_series(name = "Iceland",data = t(IceEc)) %>%
  hc_title(text = "Economical Growth") %>% 
  hc_add_theme(hc_theme_economist())

We can see that Iceland’s economy has been quite steady. United States has had a fall in 2008 (the housing crisis) and Iran has gotten worse overtime.

Back to the regression model, I’ll compute the growth between years 2009-2013, and then test it on years 2013/2014.

#selecting the columns
EcGrowth0 <- wdiEc[,c(1:4,54:60)]
EcGrowth0[is.na(EcGrowth0)] <- 0
numberOfCountries = length(levels(factor(EcGrowth0$Country.Code)))


#creating an empty dataset to fill it with growth values later
EcGrowth1 = matrix(nrow = numberOfCountries,ncol = 8)
EcGrowth1 = as.data.frame(EcGrowth1)
colnames(EcGrowth1) <- colnames(EcGrowth0)[c(2,5:11)]
EcGrowth1[,"Country.Code"] = levels(factor(EcGrowth0$Country.Code))
  

# calculating the annual growth for each country
for (i in 1:numberOfCountries) {
  temp <- filter(EcGrowth0,Country.Code==as.character(levels(factor(wdiEc$Country.Code))[i]))
  EcGrowth1[i,2:8] <- - weights %*% as.matrix(temp[,5:11])
}

#fitting the linear model
EcGrowth1 %>% mutate(Growth = 2*(X2014-X2013)) -> EcGrowth1
EcGrowth1[is.na(EcGrowth1)] <- 0

EcGrowthFit <- lm(Growth~X2009+X2010+X2011+X2012+X2013,EcGrowth1)
summary(EcGrowthFit)
## 
## Call:
## lm(formula = Growth ~ X2009 + X2010 + X2011 + X2012 + X2013, 
##     data = EcGrowth1)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.512e+12  8.383e+10  1.041e+11  1.067e+11  2.816e+12 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.048e+11  5.037e+10  -2.080  0.03867 *  
## X2009       -2.332e-01  7.170e-02  -3.253  0.00131 ** 
## X2010        1.902e-01  9.881e-02   1.925  0.05548 .  
## X2011        3.383e-02  2.902e-01   0.117  0.90730    
## X2012       -1.542e+00  2.239e-01  -6.886 5.43e-11 ***
## X2013        1.536e+00  7.277e-02  21.112  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.56e+11 on 230 degrees of freedom
## Multiple R-squared:  0.9599, Adjusted R-squared:  0.959 
## F-statistic:  1100 on 5 and 230 DF,  p-value: < 2.2e-16

Let’s predict iran’s ranking in 2014/2015 growth factor.

# loading the test data and changing the names so that the years are similar to the model we used
testdata<- EcGrowth1[,3:7]
colnames(testdata) <- colnames(EcGrowth1)[2:6]

EcGrowth1 <- data.frame(EcGrowth1,predict(EcGrowthFit,testdata))
EcGrowth1<- EcGrowth1[order(desc(EcGrowth1$predict.EcGrowthFit..testdata.)),]

which(EcGrowth1$Country.Code=="IRN")
## [1] 236

It’s the last country. What a surprise.


II) Health

1. Comparing Iran With Other Countries

The following Indicators are compared in the year 2013.

  • Number of under-five deaths / SH.DTH.MORT

  • Adults (ages 15+) newly infected with HIV / SH.HIV.INCD

  • Health expenditure per capita (current US$) / SH.XPD.PCAP

  • Population ages 65 and above (% of total) / SP.POP.65UP.TO.ZS

  • Life expectancy at birth, total (years) / SP.DYN.LE00.IN

Indicators = c("SH.DTH.MORT","SH.HIV.INCD","SH.XPD.PCAP","SP.POP.65UP.TO.ZS","SP.DYN.LE00.IN")
wdiHe %>% filter(Indicator.Code %in% Indicators, !is.na(X2013)) ->Health

A) Number of under-five deaths

Iran is Ranked 47 amongst 193 countries in the world.

Health %>% filter(Indicator.Code == "SH.DTH.MORT") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Number of under-five deaths") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

B)Adults (ages 15+) newly infected with HIV

Iran is Ranked 33 amongst 193 countries in the world.

Health %>% filter(Indicator.Code == "SH.HIV.INCD") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Adults (ages 15+) newly infected with HIV") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

C)Health expenditure per capita (current US$)

Iran is Ranked 96 amongst 193 countries in the world.

Health %>% filter(Indicator.Code == "SH.XPD.PCAP") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Health expenditure per capita (current US$)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

D)Population ages 65 and above (% of total)

Iran is Ranked 123 amongst 193 countries in the world.

Health %>% filter(Indicator.Code == "SP.POP.65UP.TO.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Population ages 65 and above (% of total)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

E)Life expectancy at birth, total (years)

Iran is Ranked 75 amongst 193 countries in the world.

Health %>% filter(Indicator.Code =="SP.DYN.LE00.IN") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Life expectancy at birth, total (years)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

2. Calculating PCAs

Similar to the Economics part we have:

# averaging on all the years
Values <- rowMeans(wdiHe[,5:60],na.rm = TRUE)
wdiHe2<- data.frame(wdiHe$Country.Code,wdiHe$Country.Name,wdiHe$Indicator.Code,Values)
names(wdiHe2) <- c("Country.Code","Country.Name","Indicator.Code","Values")

# reshaping the dataset
require(tidyr)
wdiHe2 %>% spread(Indicator.Code,Values)->wdiHe2
Names = wdiHe2[,1]
wdiHe2[is.na(wdiHe2)]=0
row.names(wdiHe2) = wdiHe2$Country.Code

#calculating PCs
PCmatHe <- prcomp(wdiHe2[,-c(1,2)], scale. = T, center = T)
plot(summary(PCmatHe)$importance[3,], type="l",
     ylab="%variance explained",
     xlab="nth component (decreasing order)")
abline(h=0.8, col="indianred")

We’ll be keeping 16 PCs, describing about 80% of the data’s variance.

# choosing first 16 PCs, and showing the feature vector
chosen.components = 1:16
feature.vector = PCmatHe$rotation[,chosen.components]

knitr:: kable(feature.vector[1:10,1:10])
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
SG.VAW.1549.ZS 0.0566705 0.0129454 0.0400020 -0.0418096 0.0613298 -0.0657096 0.0325744 -0.0341940 -0.0810918 0.1388954
SG.VAW.ARGU.ZS 0.0957454 0.0024007 0.0678380 0.0742735 0.0947387 0.0148493 0.0051016 0.1135740 -0.2454665 -0.0034066
SG.VAW.BURN.ZS 0.0924884 -0.0012441 0.0585134 0.0840545 0.0940960 0.0105724 0.0040134 0.1002811 -0.2354228 0.0132317
SG.VAW.GOES.ZS 0.0975128 0.0018089 0.0672851 0.0825888 0.0889987 0.0096868 -0.0013800 0.1248553 -0.2538857 0.0003699
SG.VAW.NEGL.ZS 0.0988371 0.0080845 0.0756527 0.0715211 0.0750927 0.0078051 0.0029061 0.1366565 -0.2463302 0.0207760
SG.VAW.REAS.ZS 0.1108268 0.0035733 0.0650819 0.0996065 0.0598048 0.0086884 -0.0127148 0.0786277 -0.1087742 -0.0358202
SG.VAW.REFU.ZS 0.0932232 -0.0052615 0.0569789 0.0917695 0.1004684 0.0121666 -0.0268809 0.1007971 -0.2493176 -0.0335532
SH.ANM.ALLW.ZS 0.1078361 0.0822623 -0.0796794 0.0110717 -0.0171977 -0.0024644 -0.0390208 -0.0206622 -0.0762716 0.0170956
SH.ANM.CHLD.ZS 0.1216283 0.0581052 -0.0734171 0.0033365 0.0127704 -0.0367793 -0.0484936 -0.0121745 -0.0568834 0.0124750
SH.ANM.NPRG.ZS 0.1073305 0.0883522 -0.0787823 0.0096301 -0.0096217 -0.0307600 -0.0253603 -0.0183766 -0.0722821 0.0105705
wdiHealth  = cbind(Country.Code= wdiHe2$Country.Code,Country.Name = wdiHe2$Country.Name,data.frame(PCmatHe$x))

# Rank according to the first and second PC
wdiHealth[,"Ranking1"]= rank(-wdiHealth$PC1,ties.method = "min")
wdiHealth[,"Ranking2"]= rank(-wdiHealth$PC2,ties.method = "min")

Now let’s see how the first pc ranks the countries:

IransRank = filter(wdiHealth,Country.Code=="IRN")$Ranking1
hchart( wdiHealth, type = "column", x= Country.Name, y=Ranking1, group = Ranking1) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "PC1") %>%
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1))) %>% 
  hc_legend(enabled=F)

Iran is ranked 109 amongst 236 countries. This is really amazing, almost all of the first countries are located in africa and are in a bad hygenic situation compared to the last ones which are well developed rich countries such as switzerland and sweden. This shows that the first PC has saved the variance really well.

The first 20 countries according to the second PC are ranked below.

IransRank = filter(wdiHealth,Country.Code=="IRN")$Ranking2

wdiHealth = wdiHealth[order(wdiHealth$Ranking2),]
hchart( wdiHealth[1:20,], type = "column", x= Country.Name[1:20], y=Ranking2[1:20], group = Ranking2[1:20]) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "PC2") %>%
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1))) %>% 
  hc_legend(enabled=F)

3. Clustering the Countries Based on Health Factors

library(cluster)
library(fpc)

#Clustering on the first 16 pcs
clus <- kmeans(wdiHealth[,3:19], centers=5)

clusplot(wdiHealth[,3:22], clus$cluster, color=TRUE, shade=TRUE, 
         labels=2, lines=0)

Let’s find out about the countires inside each cluster. This is just amazing! One of the clusters consists of only african countries!

k5Clus <- data.frame(clus$cluster)
k5Clus <- data.frame(rownames(k5Clus),clus$cluster)
names(k5Clus) <- c("Country.Code","Cluster")

countrynames = select(wdiHe2, Country.Name, Country.Code)
k5Clus = merge (countrynames, k5Clus, by = "Country.Code")

hchart(k5Clus, type = "column", x=Country.Name, y=Cluster, group = Cluster) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Members of Each Cluster Regardin Health") %>%
  hc_xAxis(title = list(text = "Country Name")) %>% 
  hc_yAxis(title = list(text = "Cluster"))

The countries similar to Iran Health-wise are:

n <- k5Clus$Cluster[which(k5Clus$Country.Code=="IRN")]

k5Clus %>% filter(.,Cluster==n) -> temp

knitr::kable(temp[1:10,])
Country.Code Country.Name Cluster
ALB Albania 4
ARG Argentina 4
ARM Armenia 4
AZE Azerbaijan 4
BIH Bosnia and Herzegovina 4
BLZ Belize 4
BOL Bolivia 4
BRA Brazil 4
BRB Barbados 4
BTN Bhutan 4

4. A Linear Model For coutries Growth

we reduce the data to the first PC. Growth factor is similar to what was defined before. Let’s see the growth factor explained above for Iran, USA and iceland.

wdiHe %>% filter(Country.Code=="IRN") -> IranHe
IranHe[is.na(IranHe)] <- 0
wdiHe %>% filter(Country.Code=="ISL") -> IceHe
IceHe[is.na(IceHe)] <- 0
wdiHe %>% filter(Country.Code=="USA") -> USHe
USHe[is.na(USHe)] <- 0

weights <- PCmatHe$rotation[,1]

IranHe <- (-(weights %*% as.matrix(IranHe[,6:58])) + (weights %*% as.matrix(IranHe[,5:57])))
USHe <- -(weights %*% as.matrix(USHe[,6:58])) + (weights %*% as.matrix(USHe[,5:57]))
IceHe <- -(weights %*% as.matrix(IceHe[,6:58])) + (weights %*% as.matrix(IceHe[,5:57]))


highchart() %>% 
  hc_xAxis(categories = 1960:2012) %>% 
  hc_add_series(name = "IRAN",data = t(IranHe)) %>% 
  hc_add_series(name = "USA",data = t(USHe)) %>%
  hc_add_series(name = "Iceland",data = t(IceHe)) %>%
  hc_title(text = "Health Growth") %>% 
  hc_add_theme(hc_theme_economist())

Again, Iceland is an steady (and beautiful country), United States Health growth oscillates and apparently Iran has been getting better in the last couple of years.

Back to the regression model, I’ll compute the growth between years 2009-2013, and then test it on years 2013/2014.

#selecting the columns
HeGrowth0 <- wdiHe[,c(1:4,54:60)]
HeGrowth0[is.na(HeGrowth0)] <- 0
numberOfCountries = length(levels(factor(HeGrowth0$Country.Code)))


#creating an empty dataset to fill it with growth values later
HeGrowth1 = matrix(nrow = numberOfCountries,ncol = 8)
HeGrowth1 = as.data.frame(HeGrowth1)
colnames(HeGrowth1) <- colnames(HeGrowth0)[c(2,5:11)]
HeGrowth1[,"Country.Code"] = levels(factor(HeGrowth0$Country.Code))
  

# calculating the annual growth for each country
for (i in 1:numberOfCountries) {
  temp <- filter(HeGrowth0,Country.Code==as.character(levels(factor(wdiHe$Country.Code))[i]))
  HeGrowth1[i,2:8] <- - weights %*% as.matrix(temp[,5:11])
}

#fitting the linear model
HeGrowth1 %>% mutate(Growth = 2*(X2014-X2013)) -> HeGrowth1
HeGrowth1[is.na(HeGrowth1)] <- 0

EcGrowthFit <- lm(Growth~X2009+X2010+X2011+X2012+X2013,HeGrowth1)
summary(EcGrowthFit)
## 
## Call:
## lm(formula = Growth ~ X2009 + X2010 + X2011 + X2012 + X2013, 
##     data = HeGrowth1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30192.0    -82.4    254.7    556.6  27717.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.586e+02  2.986e+02  -0.866    0.387    
## X2009        2.200e-02  5.927e-02   0.371    0.711    
## X2010        3.753e-02  8.598e-03   4.365 1.92e-05 ***
## X2011       -1.431e+00  1.199e-01 -11.940  < 2e-16 ***
## X2012       -1.699e-01  2.624e-02  -6.473 5.75e-10 ***
## X2013        1.539e+00  7.475e-02  20.593  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4411 on 230 degrees of freedom
## Multiple R-squared:  0.9938, Adjusted R-squared:  0.9937 
## F-statistic:  7421 on 5 and 230 DF,  p-value: < 2.2e-16

Let’s predict iran’s ranking in 2014/2015 growth factor.

# loading the test data and changing the names so that the years are similar to the model we used
testdata<- HeGrowth1[,3:7]
colnames(testdata) <- colnames(HeGrowth1)[2:6]

HeGrowth1 <- data.frame(HeGrowth1,predict(EcGrowthFit,testdata))
HeGrowth1<- HeGrowth1[order(desc(HeGrowth1$predict.EcGrowthFit..testdata.)),]

which(HeGrowth1$Country.Code=="IRN")
## [1] 120

Iran is ranked 120 amongst 236 countries. The first and last countries are Bangladesh and USA!


III) Education

1. Comparing Iran With Other Countries

  • Youth literacy rate, population 15-24 years, both sexes / SE.ADT.1524.LT.ZS
  • Duration of compulsory education (years) / SE.COM.DURS
  • Net enrolment rate, primary, both sexes (%) / SE.PRM.NENR
  • Government expenditure on education, total (% of GDP) / SE.XPD.TOTL.GD.ZS
  • Unemployment, total (% of total labor force) / SL.UEM.TOTL.ZS
Indicators = c("SE.ADT.1524.LT.ZS","SE.COM.DURS","SE.PRM.NENR","SE.XPD.TOTL.GD.ZS","SL.UEM.TOTL.ZS")
wdiEd %>% filter(Indicator.Code %in% Indicators, !is.na(X2013)) ->Education

A)Youth literacy rate, population 15-24 years, both sexes

Iran is Ranked 18 amongst 193 countries in the world.

Education %>% filter(Indicator.Code == "SE.ADT.1524.LT.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Youth literacy rate, population 15-24 years, both sexes") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

B)Duration of compulsory education (years)

Iran is Ranked 159 amongst 193 countries in the world.

Education %>% filter(Indicator.Code == "SE.COM.DURS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Duration of compulsory education (years)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

C)Net enrolment rate, primary, both sexes (%)

Iran is Ranked 15 amongst 193 countries in the world.

Education %>% filter(Indicator.Code == "SE.PRM.NENR") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Net enrolment rate, primary, both sexes (%)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

D)Government expenditure on education, total (% of GDP)

Iran is Ranked 64 amongst 193 countries in the world.

Education %>% filter(Indicator.Code == "SE.XPD.TOTL.GD.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Government expenditure on education, total (% of GDP)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

E) Unemployment, total (% of total labor force)

Iran is Ranked 35 amongst 193 countries in the world.

Education %>% filter(Indicator.Code == "SL.UEM.TOTL.ZS") %>% select(Country.Name,Country.Code,X2013)-> plot1
plot1[,"Ranking"] = rank(-plot1$X2013,ties.method = "min")
IransRank = filter(plot1,Country.Code=="IRN")$Ranking
hchart( plot1,type = "column", x= Country.Name, y=Ranking, group = Ranking) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Unemployment, total (% of total labor force)") %>% 
  hc_legend(enabled=F) %>% 
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1)))

2. Calculating PCAs

Similar to the Economics part we have:

# averaging on all the years
Values <- rowMeans(wdiEd[,5:60],na.rm = TRUE)
wdiEd2<- data.frame(wdiEd$Country.Code,wdiEd$Country.Name,wdiEd$Indicator.Code,Values)
names(wdiEd2) <- c("Country.Code","Country.Name","Indicator.Code","Values")

# reshaping the dataset
require(tidyr)
wdiEd2 %>% spread(Indicator.Code,Values)->wdiEd2
Names = wdiEd2[,1]
wdiEd2[is.na(wdiEd2)]=0
row.names(wdiEd2) = wdiEd2$Country.Code

#calculating PCs
PCmatEd <- prcomp(wdiEd2[,-c(1,2)], scale. = T, center = T)
plot(summary(PCmatEd)$importance[3,], type="l",
     ylab="%variance explained",
     xlab="nth component (decreasing order)")
abline(h=0.8, col="indianred")

We’ll be keeping 18 PCs, describing about 80% of the data’s variance.

# choosing first 18 PCs, and showing the feature vector
chosen.components = 1:18
feature.vector = PCmatEd$rotation[,chosen.components]

knitr:: kable(feature.vector[1:10,1:10])
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
SE.ADT.1524.LT.FE.ZS -0.0466829 -0.0903742 0.0485449 -0.0815209 0.0021185 -0.2575296 0.0788784 -0.1169873 0.0115377 -0.0861005
SE.ADT.1524.LT.FM.ZS -0.0361481 -0.0938534 -0.0412844 -0.0320101 -0.0366465 -0.2703352 0.0379462 -0.1119921 0.0006710 -0.0990561
SE.ADT.1524.LT.MA.ZS -0.0368997 -0.1036704 0.0508523 -0.0648474 0.0171239 -0.2596017 0.0743038 -0.0933686 0.0052279 -0.0874504
SE.ADT.1524.LT.ZS -0.0420739 -0.0971189 0.0500969 -0.0739121 0.0095893 -0.2593373 0.0769504 -0.1053578 0.0085523 -0.0873551
SE.ADT.LITR.FE.ZS -0.0590757 -0.0679555 0.0454932 -0.0985054 -0.0106339 -0.2455876 0.0842363 -0.1304639 0.0168291 -0.0751854
SE.ADT.LITR.MA.ZS -0.0459289 -0.0919128 0.0515688 -0.0776737 0.0075393 -0.2557143 0.0779425 -0.1007470 0.0129747 -0.0877970
SE.ADT.LITR.ZS -0.0529924 -0.0802100 0.0490544 -0.0893385 -0.0015594 -0.2521274 0.0817540 -0.1161200 0.0151838 -0.0825169
SE.COM.DURS -0.0763946 0.0302383 0.0193303 0.0163701 0.0128389 0.0555283 0.0430003 -0.1051711 0.0098574 0.0296149
SE.ENR.PRIM.FM.ZS -0.1150408 -0.0764697 0.0584873 0.0317977 -0.0432859 0.0333286 -0.1245975 -0.0451965 -0.0026453 0.0460988
SE.ENR.PRSC.FM.ZS -0.1183980 -0.0684360 0.0530856 0.0133121 -0.0464599 0.0437321 -0.0841046 -0.0412218 0.0006640 0.0794846
wdiEducate  = cbind(Country.Code= wdiEd2$Country.Code,Country.Name = wdiEd2$Country.Name,data.frame(PCmatEd$x))

# Rank according to the first and second PC
wdiEducate[,"Ranking1"]= rank(-wdiEducate$PC1,ties.method = "min")
wdiEducate[,"Ranking2"]= rank(-wdiEducate$PC2,ties.method = "min")

Now let’s see how the first pc ranks the countries:

IransRank = filter(wdiEducate,Country.Code=="IRN")$Ranking1
hchart( wdiEducate, type = "column", x= Country.Name, y=Ranking1, group = Ranking1) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "PC1") %>%
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1))) %>% 
  hc_legend(enabled=F)

Iran is ranked 150 amongst 236 countries. the First countries are rather small countries and the last ones are almost all europian. my guess is that this ranking has to do something with the school systems.

The first 20 countries according to the second PC are ranked below.

IransRank = filter(wdiEducate,Country.Code=="IRN")$Ranking2

wdiEducate = wdiEducate[order(wdiEducate$Ranking2),]
hchart( wdiEducate[1:20,], type = "column", x= Country.Name[1:20], y=Ranking2[1:20], group = Ranking2[1:20]) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "PC2") %>%
  hc_xAxis(title = list(text = "Country Name"),
           plotLines = list(
             list(label = list(text = "Iran"),
                  color = "#1f618d",
                  width = 2,
                  value = IransRank-1))) %>% 
  hc_legend(enabled=F)

3. Clustering the Countries Based on Educational Factors

library(cluster)
library(fpc)

#Clustering on the first 18 pcs
clus <- kmeans(wdiEducate[,3:21], centers=5)

clusplot(wdiEducate[,3:22], clus$cluster, color=TRUE, shade=TRUE, 
         labels=2, lines=0)

Let’s find out about the countires inside each cluster.

k5Clus <- data.frame(clus$cluster)
k5Clus <- data.frame(rownames(k5Clus),clus$cluster)
names(k5Clus) <- c("Country.Code","Cluster")

countrynames = select(wdiEd2, Country.Name, Country.Code)
k5Clus = merge (countrynames, k5Clus, by = "Country.Code")

hchart(k5Clus, type = "column", x=Country.Name, y=Cluster, group = Cluster) %>% 
  hc_add_theme(hc_theme_economist()) %>% 
  hc_title(text = "Members of Each Cluster Regarding Education") %>%
  hc_xAxis(title = list(text = "Country Name")) %>% 
  hc_yAxis(title = list(text = "Cluster"))

The countries similar to Iran Education-wise are:

n <- k5Clus$Cluster[which(k5Clus$Country.Code=="IRN")]

k5Clus %>% filter(.,Cluster==n) -> temp

knitr::kable(temp[1:10,])
Country.Code Country.Name Cluster
ABW Aruba 3
ALB Albania 3
ARE United Arab Emirates 3
ARG Argentina 3
AZE Azerbaijan 3
BHR Bahrain 3
BHS Bahamas, The 3
BIH Bosnia and Herzegovina 3
BLZ Belize 3
BOL Bolivia 3

4. A Linear Model For coutries Growth

we reduce the data to the first PC. Growth factor is similar to what was defined before. Let’s see the growth factor explained above for Iran, USA and iceland.

wdiEd %>% filter(Country.Code=="IRN") -> IranEd
IranEd[is.na(IranEd)] <- 0
wdiEd %>% filter(Country.Code=="ISL") -> IceEd
IceEd[is.na(IceEd)] <- 0
wdiEd %>% filter(Country.Code=="USA") -> USEd
USEd[is.na(USEd)] <- 0

weights <- PCmatEd$rotation[,1]

IranEd <- (-(weights %*% as.matrix(IranEd[,6:58])) + (weights %*% as.matrix(IranEd[,5:57])))
USEd <- -(weights %*% as.matrix(USEd[,6:58])) + (weights %*% as.matrix(USEd[,5:57]))
IceEd <- -(weights %*% as.matrix(IceEd[,6:58])) + (weights %*% as.matrix(IceEd[,5:57]))


highchart() %>% 
  hc_xAxis(categories = 1960:2012) %>% 
  hc_add_series(name = "IRAN",data = t(IranEd)) %>% 
  hc_add_series(name = "USA",data = t(USEd)) %>%
  hc_add_series(name = "Iceland",data = t(IceEd)) %>%
  hc_title(text = "Education Growth") %>% 
  hc_add_theme(hc_theme_economist())

As always, Iceland’s situation is amaizgly steady. Iran has had a drop in year 1989 which is 1368 in our calendar that can be explained by the “Imposed Wars” being started. In the last couple of years we don’t see that much progress in educational growth of these countries.

Back to the regression model, I’ll compute the growth between years 2009-2013, and then test it on years 2013/2014.

#selecting the columns
EdGrowth0 <- wdiEd[,c(1:4,54:60)]
EdGrowth0[is.na(EdGrowth0)] <- 0
numberOfCountries = length(levels(factor(EdGrowth0$Country.Code)))


#creating an empty dataset to fill it with growth values later
EdGrowth1 = matrix(nrow = numberOfCountries,ncol = 8)
EdGrowth1 = as.data.frame(EdGrowth1)
colnames(EdGrowth1) <- colnames(EdGrowth0)[c(2,5:11)]
EdGrowth1[,"Country.Code"] = levels(factor(EdGrowth0$Country.Code))
  

# calculating the annual growth for each country
for (i in 1:numberOfCountries) {
  temp <- filter(EdGrowth0,Country.Code==as.character(levels(factor(wdiEd$Country.Code))[i]))
  EdGrowth1[i,2:8] <- - weights %*% as.matrix(temp[,5:11])
}

#fitting the linear model
EdGrowth1 %>% mutate(Growth = 2*(X2014-X2013)) -> EdGrowth1
EdGrowth1[is.na(EdGrowth1)] <- 0

EdGrowthFit <- lm(Growth~X2009+X2010+X2011+X2012+X2013,EdGrowth1)
summary(EdGrowthFit)
## 
## Call:
## lm(formula = Growth ~ X2009 + X2010 + X2011 + X2012 + X2013, 
##     data = EdGrowth1)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -10996463     24840    100376    108600   4532026 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -1.086e+05  7.272e+04  -1.493  0.13676   
## X2009        7.236e+00  2.311e+00   3.131  0.00197 **
## X2010       -4.840e+00  2.852e+00  -1.697  0.09107 . 
## X2011        3.606e+00  2.991e+00   1.206  0.22921   
## X2012       -5.700e+00  2.717e+00  -2.098  0.03699 * 
## X2013       -4.256e-01  7.059e-01  -0.603  0.54720   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1082000 on 230 degrees of freedom
## Multiple R-squared:  0.819,  Adjusted R-squared:  0.8151 
## F-statistic: 208.2 on 5 and 230 DF,  p-value: < 2.2e-16

Let’s predict iran’s ranking in 2014/2015 growth factor.

# loading the test data and changing the names so that the years are similar to the model we used
testdata<- EdGrowth1[,3:7]
colnames(testdata) <- colnames(EdGrowth1)[2:6]

EdGrowth1 <- data.frame(EdGrowth1,predict(EdGrowthFit,testdata))
EdGrowth1<- EdGrowth1[order(desc(EdGrowth1$predict.EdGrowthFit..testdata.)),]

which(EdGrowth1$Country.Code=="IRN")
## [1] 28

It’s the 28th Country, Great news!